Intro: Use DAVINCHI

In this vignette, we demonstrate how to use DAVINCHI to

To run this vignette, we first load the package.

library(DAVINCHI)
library(data.table)


Load the data

Here we use the two mouse brain slices as the working example, and we start by loading the first slice (V1).

gene.exp <- fread("~/functional/Globus/RongLab/MouseBrain/V1transcriptome/whole.tsv", header = T, sep = "\t")
row.names <- as.character(gene.exp$V1)
gene.exp <- as.data.frame(gene.exp[,2:ncol(gene.exp)])
rownames(gene.exp) <- row.names
gene.exp <- t(gene.exp)


t1 <- apply(gene.exp,2,function(x){sum(x!=0)})
gene.exp.V1 <- gene.exp[,which(t1!=0)]


#get the coordinates
col.names <- colnames(gene.exp.V1)
x <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[1]})))
y <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[2]})))
coor.V1 <- cbind(x, y)
colnames(coor.V1) <- c("array_row", "array_col")
rownames(coor.V1) <- col.names


#gene.exp.V1, coor.V1

#add the atac information
input.atac.V1 <- readRDS("~/Desktop/KPMP/code/PLIER_Ahmed/speed_sylvester/MultiModality/SpatialCoAssay/RongLab/input/MouseBrain_V2/ATAC/Mouse_brain_V2_spatial_ATAC_peak_matrix.rds")
#save.label <- "peak"



barcodes <- unlist(lapply(strsplit(colnames(input.atac.V1), "#"), function(x){x[2]}))
barcodes <- gsub("-1", "", barcodes)
meta <- readRDS("~/Desktop/CHARM/AIM/mouse_spleen_ATAC_JUL2023/DBiTseq_2500.coor.RDS")
meta <- meta[match(barcodes, rownames(meta)),]


coor.atac.V1 <- meta[,c("col", "row")]
coor.atac.V1 <- coor.atac.V1+1
#table(unlist(lapply(1:nrow(meta), function(x){grepl(rownames(meta)[x], colnames(input.atac.V1)[x])})))
rownames(coor.atac.V1) <- paste0(coor.atac.V1[,1], "x", coor.atac.V1[,2])
colnames(coor.atac.V1) <- c("array_row", "array_col")
colnames(input.atac.V1) <- rownames(coor.atac.V1)

#coor.atac.V1

input.atac.V1 <- input.atac.V1[,rownames(coor.V1)] 
coor.atac.V1 <- coor.atac.V1[rownames(coor.V1),]


Then we load the second slice (V2) of mouse brain samples.

input <- readRDS("~/functional/Globus/RongLab/MouseBrain/V2transcriptome/mbrnamb2_pca20_res1_dims20.rds")

gene.exp.V2 <- input@assays$Spatial@counts

#get the coordinates
col.names <- colnames(gene.exp.V2)
x <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[1]})))
y <- as.numeric(unlist(lapply(strsplit(col.names, "x"), function(x){x[2]})))
coor.V2 <- cbind(x, y)
colnames(coor.V2) <- c("array_row", "array_col")
rownames(coor.V2) <- col.names


#gene.exp.V2, coor.V2

#add the atac information
input.atac.V2 <- readRDS("~/Desktop/KPMP/code/PLIER_Ahmed/speed_sylvester/MultiModality/SpatialCoAssay/RongLab/input/MouseBrain_V2/ATAC/Mouse_brain_V2_spatial_ATAC_peak_matrix.rds")
barcodes <- unlist(lapply(strsplit(colnames(input.atac.V2), "#"), function(x){x[2]}))
barcodes <- gsub("-1", "", barcodes)

meta <- readRDS("~/Desktop/CHARM/AIM/mouse_spleen_ATAC_JUL2023/DBiTseq_2500.coor.RDS")
meta <- meta[match(barcodes, rownames(meta)),]



coor.atac.V2 <- meta[,c("col", "row")]
coor.atac.V2 <- coor.atac.V2+1
#table(unlist(lapply(1:nrow(meta), function(x){grepl(rownames(meta)[x], colnames(input.atac.V2)[x])})))
rownames(coor.atac.V2) <- paste0(coor.atac.V2[,1], "x", coor.atac.V2[,2])
colnames(coor.atac.V2) <- c("array_row", "array_col")
colnames(input.atac.V2) <- rownames(coor.atac.V2)

#coor.atac.V2


input.atac.V2 <- input.atac.V2[,rownames(coor.V2)] 
coor.atac.V2 <- coor.atac.V2[rownames(coor.V2),]


Holistic integration

The general strategy to conduct multi-modal multi-sample integration is to first finish horizontal integration (sample-wise integration for each modality), and then proceed with vertical integration (across modality).

Horizontal integation on the RNA-seq

We preprocess gene expression data using a simple and standard pipeline. The utility function is named as preprocess which provides flexible options to preprocess the spatial omics data.

  • This pipeline keeps all the features instead of only retaining highly-variable features.
  • This pipeline filters out features below certain sparsity threshold and calculate the graph laplacian given the spot coordinates.
fin.V1 <- DAVINCHI::preprocess(gene.exp.V1, coor.V1, type = "rna", graph.opt = "Tri.mesh",  frac.thr = 0.95)

fin.V2 <- DAVINCHI::preprocess(gene.exp.V2, coor.V2, type = "rna", graph.opt = "Tri.mesh",  frac.thr = 0.95)


To initiate the horizontal integration using Horizontal.Integration, we need to construct three list objects,

  • Y.list list of gene expression matrix
  • L.list list of graph laplacian
  • coor.list list of spot coordinates

Horizontal.Integration is the core function to perform horizontal integration with Y.list, L.list and coor.list as input. k.args encodes the number of latent variables corresponding to each sample. k=12 is generally a good choice but you can refer to the other tutorial on how to optimize this parameter.

Y.list <- list(fin.V1$mat, fin.V2$mat)
L.list <- list(fin.V1$L, fin.V2$L)
coor.list <- list(fin.V1$coor, fin.V2$coor)


ptm <- proc.time()
integration.res <- Horizontal.Integration(Y.list, L.list, coor.list, k.args = c(12,12), LVs.filter.thr = 0.8, mod = "all", remove.LV1 = T)
## [1] "L1 is set to 33.8871316128288"
## [1] "L2 is set to 101.661394838487"
## 50  
## [1] "L1 is set to 31.4599804538765"
## [1] "L2 is set to 94.3799413616295"
## 100 50 100
## 200 100 200
## 150 100 200
## 150 100 200
## [1] "L1 is set to 31.4599804538765"
## [1] "L2 is set to 94.3799413616295"
## [1] "L1 is set to 33.8871316128288"
## [1] "L2 is set to 101.661394838487"
print(proc.time()-ptm)  
##    user  system elapsed 
## 616.858  10.265 628.630


Before completing the holistic integration, we can review the clusters based on the RNA-seq integation results. We provide a wrapper function swk to cluster the integrated latent variables and get the spatial niche patterns. Here we used model-based clustering approaches and set the number of clusters as 8. We visualize the spatial clusters using another wrapper function scatter.DiscretePlot.

cluster.RNA <- swk(integration.res$LVs_embeddings, method = "mclust", L2norm = T, mclust.model = "EEE", mclust.num = 8, random.seed = 1)
## [1] "Number of unique clusters is 8"
p.V1 <- scatter.DiscretePlot(coor.V1, as.character(cluster.RNA)[integration.res$slice_id=="1"], pt.size = 2)
p.V2 <- scatter.DiscretePlot(coor.V2, as.character(cluster.RNA)[integration.res$slice_id=="2"], pt.size = 2)

ggarrange(p.V1, p.V2)


Horizontal integation on the ATAC-seq

We preprocess ATAC-seq data using a simple and standard pipeline. The utility function is named as preprocess which provides flexible options to preprocess the spatial omics data in general. LSI normalization is also available as one of the choices.

fin.atac.V1 <- DAVINCHI::preprocess(input.atac.V1, coor.V1, type = "rna", graph.opt = "Tri.mesh",  frac.thr = 0.95)

fin.atac.V2 <- DAVINCHI::preprocess(input.atac.V2, coor.V2, type = "rna", graph.opt = "Tri.mesh",  frac.thr = 0.95)


We first construct three objects Y.list, L.list and coor.list as input to the core function Horizontal.Integration.

Y.list <- list(fin.atac.V1$mat, fin.atac.V2$mat)
L.list <- list(fin.atac.V1$L, fin.atac.V2$L)
coor.list <- list(fin.atac.V1$coor, fin.atac.V2$coor)


ptm <- proc.time()
integration.atac.res <- Horizontal.Integration(Y.list, L.list, coor.list, k.args = c(12,12), LVs.filter.thr = 0.8, mod = "all", remove.LV1 = T)
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
## 25 25 50
## 37.5 25 50
## 31.25 25 37.5
## 31.25 25 37.5
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
## 25 25 50
## 37.5 25 50
## 31.25 25 37.5
## 31.25 25 37.5
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
## [1] "L1 is set to 14.9287924479"
## [1] "L2 is set to 44.7863773436999"
print(proc.time()-ptm) 
##    user  system elapsed 
## 567.754  12.230 581.414


Vertical integation of RNA-seq and ATAC-seq

Here we use BiModalIntegration to integrate the spatal RNA and spatial ATAC, and the input are the latent variables from each modality.

ptm <- proc.time()
vertical.res <- BiModalIntegration(modal.1 = integration.res$LVs_embeddings, modal.2 = integration.atac.res$LVs_embeddings, n_neighbors = 20)
print(proc.time()-ptm)
##    user  system elapsed 
##  19.899   0.249  20.936


The output from BiModalIntegration includes the modality weights specifying the weights for integration per spot. We can use scatter.FeaturePlot again to visualize the weights by position for all slices of all modalities.

p.rna.weight1 <- scatter.FeaturePlot(coor.V1, LVs = vertical.res$weight.1[grepl("1_", names(vertical.res$weight.1))], plot.all = F, pt.size = 1)+ggtitle("RNA weights, V1")

p.atac.weight1 <- scatter.FeaturePlot(coor.V1, LVs = vertical.res$weight.2[grepl("1_", names(vertical.res$weight.2))], plot.all = F, pt.size = 1)+ggtitle("ATAC weights, V1")

p.rna.weight2 <- scatter.FeaturePlot(coor.V2, LVs = vertical.res$weight.1[grepl("2_", names(vertical.res$weight.1))], plot.all = F, pt.size = 1)+ggtitle("RNA weights, V2")

p.atac.weight2 <- scatter.FeaturePlot(coor.V2, LVs = vertical.res$weight.2[grepl("2_", names(vertical.res$weight.2))], plot.all = F, pt.size = 1)+ggtitle("ATAC weights, V2")


ggarrange(plotlist = list(p.rna.weight1, p.atac.weight1, p.rna.weight2, p.atac.weight2), nrow = 2, ncol = 2)


As we integrate both modalities, we can use the integrative latent variables for clustering to achieve a more comprehensive niche assignment. integration.res$snn.mat already encodes the shared nearest neighborhoood graph, which can be fed into community clustering algorithm. Here we are using the Seurat implementation of Leiden algorithm because of its efficiency, but you can use your preferred network-based algorithm instead to cluser, such as leiden::leiden.

ptm <- proc.time()
cluster.integrate <- Seurat:::RunLeiden(vertical.res$snn.mat, resolution.parameter = 0.6, method = "matrix")
#cluster.integrate <- Seurat:::RunModularityClustering(integration.res$snn.mat, resolution = 0.6, algorithm = 1)
print(proc.time()-ptm)
##    user  system elapsed 
##  12.672   5.282  16.697


We use scatter.DiscretePlot to visualize the intergative niches per slice.

p.V1 <- scatter.DiscretePlot(coor.V1, as.character(cluster.integrate)[which(grepl("1_", names(vertical.res$weight.1)))], pt.size = 1)
p.V2 <- scatter.DiscretePlot(coor.V2, as.character(cluster.integrate)[which(grepl("2_", names(vertical.res$weight.1)))], pt.size = 1)
ggarrange(p.V1, p.V2)


After obtaining the integrated niches, we can asses any bias of modality weights per niche to understand how each modality contributes to the niche determination. In this specific example, RNA plays a universally more dominant role compared to ATAC.

weight <- c(vertical.res$weight.1, vertical.res$weight.2)
grp <- c(as.character(cluster.integrate), as.character(cluster.integrate))
modal.name <- c(rep("RNA", length(cluster.integrate)), rep("ATAC", length(cluster.integrate) ))
slice <- rep(unlist(lapply(strsplit(names(vertical.res$weight.1), "_"), function(x){x[1]})), times = 2)

dat.to_plot <- data.frame(weight = weight, grp = grp, modal.name = modal.name, slice = slice)


p.V1 <- ggplot(dat.to_plot[dat.to_plot$slice == "1",], aes(x=grp, y = weight, fill = modal.name))+geom_violin()+theme_pubr(base_size = 20)+xlab("Clusters")+ylab("Modality.weights")+stat_compare_means(method = "wilcox.test", size = 3.5, paired = T)
p.V2 <- ggplot(dat.to_plot[dat.to_plot$slice == "2",], aes(x=grp, y = weight, fill = modal.name))+geom_violin()+theme_pubr(base_size = 20)+xlab("Clusters")+ylab("Modality.weights")+stat_compare_means(method = "wilcox.test", size = 3.5, paired = T)
ggarrange(p.V1, p.V2, nrow = 2)